# import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")
%matplotlib inline
# Load the vehicle dataset from disk
DATA_PATH = r'D:\SAI\MECH\Great Learning\Study\UNSupervisedLearning\Project\vehicledata.csv'
vechiledata = pd.read_csv(DATA_PATH)
# Peek at the first few records
vechiledata.head()
# Dimensions of the dataset (rows, columns)
vechiledata.shape
The dataset has 846 rows and 19 columns.
# Get detailed information: column dtypes, non-null counts and memory usage
vechiledata.info()
All attributes are of type int or float, except 'class', which is categorical.
# 'class' is a categorical label, not a free-form object; store it as category dtype
vechiledata = vechiledata.astype({'class': 'category'})
# Summary statistics, transposed so each row describes one attribute
vechiledata.describe().T
# Count the null entries in every column
vechiledata.isna().sum()
Some attributes have missing values; these are replaced with the column median.
# The raw file marks missing entries with '?' (and sometimes a blank string);
# turn both markers into proper NaNs so pandas can impute them.
# (The original comment said '?' but the code only replaced ' '.)
vechiledata = vechiledata.replace(['?', ' '], np.nan)
# Impute each numeric feature's missing values with that column's median.
# The first 18 columns are the numeric features; the 19th ('class') is the
# categorical target and is left untouched. (The original looped over
# columns[:17], silently skipping the 18th feature.)
for col in vechiledata.columns[:18]:
    # replace() can leave a numeric column with object dtype; coerce it back
    # so median() works (any residual non-numeric token also becomes NaN)
    vechiledata[col] = pd.to_numeric(vechiledata[col], errors='coerce')
    vechiledata[col] = vechiledata[col].fillna(vechiledata[col].median())
# Verify that no missing values remain
vechiledata.isnull().sum()
# Histograms of every attribute to inspect their distributions
vechiledata.hist(figsize=(15, 15));
# Horizontal box plots of the 18 numeric features to spot outlier-heavy columns
fig = plt.figure(figsize=(15, 7.2))
ax = sns.boxplot(data=vechiledata.iloc[:, 0:18], orient='h')
# Distinct class labels, their frequencies, and a count plot of the target
vechiledata['class'].unique()
vechiledata['class'].value_counts()
sns.countplot(vechiledata['class']);
# Per-class mean of every numeric attribute
class_groups = vechiledata.groupby('class')
class_groups.mean()
# Encode the categorical target as integers with sklearn's LabelEncoder
from sklearn.preprocessing import LabelEncoder

class_encoder = LabelEncoder()
# Replace each class label by its integer code
vechiledata['class'] = class_encoder.fit_transform(vechiledata['class'])
# KDE plots: the distribution of each numeric feature, split by encoded class
# NOTE(review): LabelEncoder assigns codes in sorted label order — verify that
# codes 0/1/2 really correspond to car/bus/van before trusting these legend names.
plt.figure(figsize=(20,30))
class_styles = [(0, 'red', 'car'), (1, 'blue', 'bus'), (2, 'yellow', 'van')]
k = 1
for col in vechiledata.columns[0:18]:
    plt.subplot(5, 4, k)
    for code, colour, label in class_styles:
        subset = vechiledata[vechiledata['class'] == code]
        sns.kdeplot(subset[col], color=colour, label=label, shade=True)
    plt.title(col)
    k = k + 1
# Pairwise correlations between all (now fully numeric) columns
cor = vechiledata.corr()
cor
# Heatmap of the correlation matrix
plt.subplots(figsize=(10, 8))
sns.heatmap(cor,annot=True,linewidths=.5,center=0,cbar=False,cmap="YlGnBu");
# Pair plot of every column, coloured by class
sns.pairplot(vechiledata,hue='class');
## Work on an independent copy so later manipulation cannot mutate the original.
## The original code (`vechiledata_split = vechiledata`) only aliased the same
## object, so no copy was actually made; .copy() fixes that.
vechiledata_split = vechiledata.copy()
vechiledata_split.head()
## Train/test split utilities and z-score standardisation
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from scipy.stats import zscore

## Features are every column except the target; y is the encoded class label
X = vechiledata_split.drop('class', axis=1)
y = vechiledata_split['class']

# Standardise each feature to zero mean / unit variance
XScaled = X.apply(zscore)
XScaled.head()

# Hold out 30% of the rows for testing, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(XScaled, y, test_size=0.30, random_state=10)
# Metrics and the Support Vector Classifier
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Fit an SVM (default RBF kernel) on the scaled training data
svc_model = SVC()
svc_model.fit(X_train, y_train)
ypred_SVM = svc_model.predict(X_test)

# Accuracy on both splits. The original second message said "train set"
# while scoring the TEST set; the label is corrected here.
print('Accuracy of SVM model on train set: {:.2f}'.format(svc_model.score(X_train, y_train)))
print('Accuracy of SVM model on test set: {:.2f}'.format(svc_model.score(X_test, y_test)))
print(classification_report(y_test, ypred_SVM))
# 10-fold cross-validation of the SVM on the full scaled dataset
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

num_folds = 10
seed = 10
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
model = SVC()
# Per-fold accuracies, then their mean and standard deviation
results = cross_val_score(model, XScaled, y, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# PCA: eigen-decomposition of the covariance matrix of the scaled data
from sklearn.decomposition import PCA

# Covariance matrix of the standardised features (rowvar=False: columns are variables)
covMatrix = np.cov(XScaled, rowvar=False)
print(covMatrix)

# Eigenvalues and eigenvectors of the covariance matrix
eigenvalues, eigenvectors = np.linalg.eig(covMatrix)
# The original passed the array as a second print() argument instead of
# %-formatting it, printing the literal "%s"; fixed to format properly.
print('Eigen Vectors \n%s' % eigenvectors)
print('\n Eigen Values \n%s' % eigenvalues)

# Pair each eigenvalue magnitude with its eigenvector and sort descending.
# An explicit key is used: plain tuple sorting would fall through to
# comparing the ndarray eigenvectors whenever two eigenvalues tie.
eigen_pairs = [(np.abs(eigenvalues[i]), eigenvectors[:, i]) for i in range(len(eigenvalues))]
eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)
eigen_pairs[:]

# np.linalg.eig does NOT return sorted eigenvalues; the original printed them
# unsorted despite the message, so sort before printing.
print('Eigenvalues in descending order: \n%s' % sorted(eigenvalues, reverse=True))

# Percentage of variance explained by each component, largest first,
# and the running cumulative total
tot = sum(eigenvalues)
var_exp = [(i / tot) * 100 for i in sorted(eigenvalues, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
plt.plot(var_exp)
# Scree plot: individual vs cumulative explained variance per component
plt.figure(figsize=(8, 7))
components = range(1, eigenvalues.size + 1)
plt.bar(components, var_exp, alpha=0.5, align='center', label='Individual explained variance')
plt.step(components, cum_var_exp, where='mid', label='Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
# Fit PCA keeping the first 7 principal components
# (the original comment said 5, but n_components=7 is what the code uses)
pca3 = PCA(n_components=7)
pca3.fit(XScaled)
print(pca3.components_)
print(pca3.explained_variance_ratio_)
Xpca3 = pca3.transform(XScaled)
Xpca3 #the data projected onto the 7 retained components
# Re-split using the PCA-transformed features. This split binds the
# capitalised Y_train/Y_test, but the original fit/score calls mixed in
# y_train/y_test left over from the PREVIOUS split — which only worked
# because the identical random_state reproduced the same row order.
# The variables from THIS split are now used consistently.
X_train, X_test, Y_train, Y_test = train_test_split(Xpca3, y, test_size=0.30, random_state=10)

# Building a Support Vector Machine on the PCA train data
svc_model = SVC()
svc_model.fit(X_train, Y_train)
ypca_SVM = svc_model.predict(X_test)

# Accuracy on both splits (the original second message wrongly said "train set")
print('Accuracy of SVM model on train set: {:.2f}'.format(svc_model.score(X_train, Y_train)))
print('Accuracy of SVM model on test set: {:.2f}'.format(svc_model.score(X_test, Y_test)))
print(classification_report(Y_test, ypca_SVM))
# 10-fold cross-validation of the SVM on the PCA-reduced data,
# using the same fold configuration as the full-feature run
num_folds = 10
seed = 10
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
model_pca = SVC()
results = cross_val_score(model_pca, Xpca3, y, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
We can say that the SVM model with 10-fold cross-validation on all 18 attributes gives better accuracy compared to PCA with 7 components.
Although applying PCA loses some information, it reduces the dimensionality by dropping directions that have little impact on the model.
Multicollinearity and the curse of dimensionality adversely affect any machine learning model; with the curse of dimensionality, the feature space becomes increasingly sparse as the number of dimensions grows for a fixed-size training dataset, so the model tends to overfit.
Principal Component Analysis helps address these problems and can improve model performance to a great extent.